The packages used in this project are: Rio: Chan et al. (2021) Readr: Wickham and Hester (2021) Haven: Wickham and Miller (2021)
dat <- import(here("data", "dat.csv")) %>%
clean_names() %>%
mutate_all(na_if,"")
dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$survived <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked <- as.factor(dat$disembarked)
dat <- dat %>%
mutate(nationality2 = case_when(nationality == "English" ~ "English",
nationality == "Irish" ~ "Irish",
nationality == "American" ~ "American",
nationality == "Swedish" ~ "Swedish",
nationality == "Finnish" ~ "Finnish",
nationality == "Scottish" ~ "Scottish",
nationality == "French" ~ "French",
nationality == "Italian" ~ "Italian",
nationality == "Canadian" ~ "Canadian",
nationality == "Bulgarian" ~ "Bulgarian",
nationality == "Croatian" ~ "Croatian",
nationality == "Belgian" ~ "Belgian",
nationality == "Norwegian" ~ "Norwegian",
nationality == "Channel Islander" ~ "Channel Islander",
nationality == "Welsh" ~ "Welsh",
nationality == "Swiss" ~ "Swiss",
nationality == "German" ~ "German",
nationality == "Danish" ~ "Danish",
nationality == "Spanish" ~ "Spanish",
nationality == "Australian" ~ "Australian",
nationality == "Polish" ~ "Polish",
nationality == "South African" ~ "South African",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Dutch" ~ "Dutch",
nationality == "Lithuanian" ~ "Lithuanian",
nationality == "Greek" ~ "Greek",
nationality == "Portuguese" ~ "Portuguese",
nationality == "Uruguayan" ~ "Uruguayan",
nationality == "Chinese" ~ "Chinese",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Cape Verdean" ~ "Cape Verdean",
nationality == "Egyptian" ~ "Egyptian",
nationality == "Japanese" ~ "Japanese",
nationality == "Hungarian" ~ "Hungarian",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Latvian" ~ "Latvian",
nationality == "Austrian" ~ "Austrian",
nationality == "Greek" ~ "Greek",
nationality == "Mexican" ~ "Mexican",
nationality == "Sweden" ~ "Sweedish",
nationality == "Turkish" ~ "Turkish",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Guyanese" ~ "Guyanese",
nationality == "Haitian" ~ "Haitian",
nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
nationality == "Unknown" ~ "Unknown",
TRUE ~ "Other - Multiple", ))
dat <- dat %>%
mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))
datpass <- dat %>%
filter(category=="Passenger") %>%
select(survived, gender, class, age) %>%
na.omit()
# Breakdown of passengers by class and gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(gender)) %>%
group_by(class, gender) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Gender", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Gender | Count | Percent |
|---|---|---|---|
| 1st Class | Female | 153 | 43.71 |
| 1st Class | Male | 197 | 56.29 |
| 2nd Class | Female | 112 | 38.36 |
| 2nd Class | Male | 180 | 61.64 |
| 3rd Class | Female | 216 | 30.47 |
| 3rd Class | Male | 493 | 69.53 |
# Breakdown of passenger nationalities
dat %>%
filter(!is.na(nationality2)) %>%
group_by(nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities",
col.names = c("Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Nationality | Count | Percent |
|---|---|---|
| English | 1037 | 42.36 |
| Irish | 361 | 14.75 |
| American | 246 | 10.05 |
| Other - Multiple | 116 | 4.74 |
| Swedish | 99 | 4.04 |
| Syrian/Lebanese | 86 | 3.51 |
| Finnish | 58 | 2.37 |
| Scottish | 49 | 2.00 |
| French | 44 | 1.80 |
| Italian | 41 | 1.67 |
| Canadian | 39 | 1.59 |
| Bulgarian | 33 | 1.35 |
| Croatian | 28 | 1.14 |
| Belgian | 26 | 1.06 |
| Norwegian | 26 | 1.06 |
| Channel Islander | 25 | 1.02 |
| Welsh | 23 | 0.94 |
| Swiss | 22 | 0.90 |
| German | 14 | 0.57 |
| Danish | 11 | 0.45 |
| Spanish | 9 | 0.37 |
| Australian | 7 | 0.29 |
| Polish | 6 | 0.25 |
| South African | 5 | 0.20 |
| Bosnian | 4 | 0.16 |
| Hong Kongese | 4 | 0.16 |
| Dutch | 3 | 0.12 |
| Greek | 3 | 0.12 |
| Lithuanian | 3 | 0.12 |
| Uruguayan | 3 | 0.12 |
| Chinese | 2 | 0.08 |
| Portuguese | 2 | 0.08 |
| Slovenian | 2 | 0.08 |
| Austrian | 1 | 0.04 |
| Cape Verdean | 1 | 0.04 |
| Egyptian | 1 | 0.04 |
| Guyanese | 1 | 0.04 |
| Haitian | 1 | 0.04 |
| Hungarian | 1 | 0.04 |
| Japanese | 1 | 0.04 |
| Latvian | 1 | 0.04 |
| Mexican | 1 | 0.04 |
| Sweedish | 1 | 0.04 |
| Turkish | 1 | 0.04 |
# Breakdown of passenger nationalities by class (all)
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(nationality2)) %>%
group_by(class, nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
col.names = c("Class", "Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Nationality | Count | Percent |
|---|---|---|---|
| 1st Class | American | 195 | 57.35 |
| 1st Class | English | 51 | 15.00 |
| 1st Class | Canadian | 27 | 7.94 |
| 1st Class | Other - Multiple | 14 | 4.12 |
| 1st Class | French | 10 | 2.94 |
| 1st Class | Irish | 6 | 1.76 |
| 1st Class | Swiss | 6 | 1.76 |
| 1st Class | German | 5 | 1.47 |
| 1st Class | Scottish | 5 | 1.47 |
| 1st Class | Spanish | 4 | 1.18 |
| 1st Class | Swedish | 4 | 1.18 |
| 1st Class | Uruguayan | 3 | 0.88 |
| 1st Class | Belgian | 2 | 0.59 |
| 1st Class | Italian | 2 | 0.59 |
| 1st Class | Channel Islander | 1 | 0.29 |
| 1st Class | Dutch | 1 | 0.29 |
| 1st Class | Egyptian | 1 | 0.29 |
| 1st Class | Mexican | 1 | 0.29 |
| 1st Class | Norwegian | 1 | 0.29 |
| 1st Class | Polish | 1 | 0.29 |
| 2nd Class | English | 145 | 51.06 |
| 2nd Class | Other - Multiple | 25 | 8.80 |
| 2nd Class | American | 24 | 8.45 |
| 2nd Class | Channel Islander | 12 | 4.23 |
| 2nd Class | Irish | 12 | 4.23 |
| 2nd Class | French | 11 | 3.87 |
| 2nd Class | Scottish | 8 | 2.82 |
| 2nd Class | Finnish | 6 | 2.11 |
| 2nd Class | Swedish | 6 | 2.11 |
| 2nd Class | Canadian | 5 | 1.76 |
| 2nd Class | South African | 4 | 1.41 |
| 2nd Class | Spanish | 4 | 1.41 |
| 2nd Class | Danish | 3 | 1.06 |
| 2nd Class | Italian | 3 | 1.06 |
| 2nd Class | Lithuanian | 2 | 0.70 |
| 2nd Class | Swiss | 2 | 0.70 |
| 2nd Class | Syrian/Lebanese | 2 | 0.70 |
| 2nd Class | Welsh | 2 | 0.70 |
| 2nd Class | Australian | 1 | 0.35 |
| 2nd Class | Belgian | 1 | 0.35 |
| 2nd Class | German | 1 | 0.35 |
| 2nd Class | Haitian | 1 | 0.35 |
| 2nd Class | Hungarian | 1 | 0.35 |
| 2nd Class | Japanese | 1 | 0.35 |
| 2nd Class | Norwegian | 1 | 0.35 |
| 2nd Class | Portuguese | 1 | 0.35 |
| 3rd Class | English | 112 | 15.80 |
| 3rd Class | Irish | 105 | 14.81 |
| 3rd Class | Swedish | 89 | 12.55 |
| 3rd Class | Syrian/Lebanese | 83 | 11.71 |
| 3rd Class | Other - Multiple | 69 | 9.73 |
| 3rd Class | Finnish | 52 | 7.33 |
| 3rd Class | Bulgarian | 33 | 4.65 |
| 3rd Class | Croatian | 28 | 3.95 |
| 3rd Class | Norwegian | 24 | 3.39 |
| 3rd Class | American | 23 | 3.24 |
| 3rd Class | Belgian | 22 | 3.10 |
| 3rd Class | Danish | 7 | 0.99 |
| 3rd Class | Scottish | 6 | 0.85 |
| 3rd Class | Welsh | 6 | 0.85 |
| 3rd Class | Canadian | 5 | 0.71 |
| 3rd Class | French | 5 | 0.71 |
| 3rd Class | Polish | 5 | 0.71 |
| 3rd Class | Swiss | 5 | 0.71 |
| 3rd Class | Bosnian | 4 | 0.56 |
| 3rd Class | Hong Kongese | 4 | 0.56 |
| 3rd Class | Italian | 4 | 0.56 |
| 3rd Class | Greek | 3 | 0.42 |
| 3rd Class | Channel Islander | 2 | 0.28 |
| 3rd Class | Chinese | 2 | 0.28 |
| 3rd Class | German | 2 | 0.28 |
| 3rd Class | Slovenian | 2 | 0.28 |
| 3rd Class | Australian | 1 | 0.14 |
| 3rd Class | Austrian | 1 | 0.14 |
| 3rd Class | Latvian | 1 | 0.14 |
| 3rd Class | Lithuanian | 1 | 0.14 |
| 3rd Class | Portuguese | 1 | 0.14 |
| 3rd Class | Sweedish | 1 | 0.14 |
| 3rd Class | Turkish | 1 | 0.14 |
# Average age by class
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
group_by(class) %>%
summarize(avg_age = mean(age), min_age = min(age), max_age = max(age)) %>%
kable(caption = "Average Age by Class",
col.names = c("Class", "Average Age", "Minimum Age", "Maximum Age"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Average Age | Minimum Age | Maximum Age |
|---|---|---|---|
| 1st Class | 39.12 | 0 | 71 |
| 2nd Class | 30.01 | 0 | 71 |
| 3rd Class | 25.12 | 0 | 74 |
# Survival rate by class
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(class, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, survived) %>%
kable(caption = "Survival Rate by Class",
col.names = c("Class", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Survived | Count | Percent |
|---|---|---|---|
| 1st Class | Lost | 123 | 37.96 |
| 1st Class | Saved | 201 | 62.04 |
| 2nd Class | Lost | 166 | 58.45 |
| 2nd Class | Saved | 118 | 41.55 |
| 3rd Class | Lost | 528 | 74.47 |
| 3rd Class | Saved | 181 | 25.53 |
# Survival rate by gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(gender, survived) %>%
kable(caption = "Survival Rate by Gender",
col.names = c("Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Gender | Survived | Count | Percent |
|---|---|---|---|
| Female | Lost | 127 | 27.25 |
| Female | Saved | 339 | 72.75 |
| Male | Lost | 690 | 81.08 |
| Male | Saved | 161 | 18.92 |
# Survival rate by class and gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(class, gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, gender) %>%
kable(caption = "Survival Rate by Class and Gender",
col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Gender | Survived | Count | Percent |
|---|---|---|---|---|
| 1st Class | Female | Lost | 5 | 3.47 |
| 1st Class | Female | Saved | 139 | 96.53 |
| 1st Class | Male | Lost | 118 | 65.56 |
| 1st Class | Male | Saved | 62 | 34.44 |
| 2nd Class | Female | Lost | 12 | 11.32 |
| 2nd Class | Female | Saved | 94 | 88.68 |
| 2nd Class | Male | Lost | 154 | 86.52 |
| 2nd Class | Male | Saved | 24 | 13.48 |
| 3rd Class | Female | Lost | 110 | 50.93 |
| 3rd Class | Female | Saved | 106 | 49.07 |
| 3rd Class | Male | Lost | 418 | 84.79 |
| 3rd Class | Male | Saved | 75 | 15.21 |
surv_classhist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, class)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class",
x = "Age Distribution", y = "Passenger Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
surv_agehist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, gender)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
surv_ageclass_hist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, gender)) +
facet_wrap(~class, nrow=3) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class and Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
ctree <- ctree(survived ~ gender + class + age, data=datpass)
## Note: We are aware the "saved" and "lost" labels are switched in the first graph and are working to figure out why.
plot(ctree)
ggparty(ctree) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(geom_bar(aes(x = "", fill = survived),
position = position_fill()),
theme_bw(),
xlab("Survival"), ylab("Percentage")),
shared_axis_labels = TRUE,
legend_separator = TRUE,)
fares <- import(here("data", "avgfare.csv")) %>%
clean_names()
fares$accommodation <- as.factor(fares$accommodation)
fares$accommodation <- factor(fares$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))
p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)
p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)
p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)
p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)
p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)
p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)
p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)
p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)
p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)
p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)
p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)
fares_tidy <- fares %>%
pivot_longer(cols = starts_with("fare"),
names_to = "year",
names_prefix = "fare_",
values_to = "price", names_transform = list(year = as.integer))
fare_graph <- fares_tidy %>%
ggplot(aes(year, price, colour=accommodation)) +
geom_line() +
geom_point() +
scale_colour_brewer(palette="Spectral") +
facet_wrap(~ accommodation, 4, scales = "free") +
xlim(1912,2021) +
theme(panel.spacing = unit(1, "lines")) +
labs(y = "Price ($USD)", x = "Year", title = "Inflation-Adjusted Titanic Ticket Prices", subtitle = "From 1912 to 2021", colour = "Accommodation") +
theme_minimal()
ggplotly(fare_graph)
When taking inflation rates into consideration, we see that the average price for a first class cabin in 1912 was $150.00, which today would be $4,241.74
Chan, Chung-hong, Geoffrey CH Chan, Thomas J. Leeper, and Jason Becker. 2021. Rio: A Swiss-Army Knife for Data File I/O.
Wickham, Hadley, and Jim Hester. 2021. Readr: Read Rectangular Text Data. https://CRAN.R-project.org/package=readr.
Wickham, Hadley, and Evan Miller. 2021. Haven: Import and Export ’Spss’, ’Stata’ and ’Sas’ Files. https://CRAN.R-project.org/package=haven.